データ可視化〜まだExcelで疲弊してるの?

今回だけでggplot2を習得するのは難しいので本当に紹介程度です。 コピペでも全然OKです。 岩嵜先生のとても参考になる資料がよくまとまっています。 資料1 資料2

日本語の資料はいくらでもあるし、英語も含めればなんでもある。

データ可視化は重要。 Rは可視化に強い。特に人気の可視化パッケージであるggplot2を中心に学ぶ。
これでよく論文で見かける図が作れる!matplotlib?seaborn?知らないぁ。悪いがR以外は帰ってくれないか。最近のいけてる論文のfigは結構ggplot2で作成されている ggplot2の説明は難しいのでとにかくコードを実行してたくさん図を作っていきましょう。
イメージはレイヤを重ねていく感じ

基本 ggplot() # 使うデータを指定 aes() # x軸、y軸など指定 geom_() # 棒グラフか、散布図かなど theme_() # 背景や軸の見栄え

library(ggplot2)
head(diamonds) # ダイアモンドのデータ
## # A tibble: 6 x 10
##   carat cut       color clarity depth table price     x     y     z
##   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23  Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
## 2 0.21  Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
## 3 0.23  Good      E     VS1      56.9    65   327  4.05  4.07  2.31
## 4 0.290 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
## 5 0.31  Good      J     SI2      63.3    58   335  4.34  4.35  2.75
## 6 0.24  Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
head(mpg) # 自動車の燃費のデータ
## # A tibble: 6 x 11
##   manufacturer model displ  year   cyl trans  drv     cty   hwy fl    class
##   <chr>        <chr> <dbl> <int> <int> <chr>  <chr> <int> <int> <chr> <chr>
## 1 audi         a4      1.8  1999     4 auto(… f        18    29 p     comp…
## 2 audi         a4      1.8  1999     4 manua… f        21    29 p     comp…
## 3 audi         a4      2    2008     4 manua… f        20    31 p     comp…
## 4 audi         a4      2    2008     4 auto(… f        21    30 p     comp…
## 5 audi         a4      2.8  1999     6 auto(… f        16    26 p     comp…
## 6 audi         a4      2.8  1999     6 manua… f        18    26 p     comp…
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy))

ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy, colour = class))

ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy, shape = class))
## Warning: The shape palette can deal with a maximum of 6 discrete values
## because more than 6 becomes difficult to discriminate; you have 7.
## Consider specifying shapes manually if you must have them.
## Warning: Removed 62 rows containing missing values (geom_point).

ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy, colour = "red"))

ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy)) +
  facet_wrap(~class)

ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy, colour = class)) +
  facet_wrap(~class)

ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy))

ggplot(data = mpg) +
  geom_smooth(mapping = aes(x = displ, y = hwy))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(data = mpg) +
  geom_smooth(mapping = aes(x = displ, y = hwy, colour = drv))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point(aes(colour = class)) +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

##########
ggplot(data = diamonds, mapping = aes(x = cut)) +
  geom_bar()

ggplot(data = diamonds, mapping = aes(x = cut, fill = cut)) +
  geom_bar()

ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  geom_boxplot()

a <- ggplot(diamonds, aes(cut, price, fill = cut)) +
  geom_violin(trim = FALSE) +
  geom_boxplot(width = 0.08, fill = "white", outlier.size = FALSE) +
  theme_light()

a + coord_flip()

diamondsやmpgを使って基本的なプロット geom_boxなど

画像の保存方法 GUIでやるかggsave関数

これ以降は発展的内容

viridis colour

参照 視覚多様性の時代

# viridis theme
data <- data.frame(x = rnorm(10000), y = rnorm(10000))
# viridis
g1 <- ggplot(data, aes(x = x, y = y)) + geom_hex() + coord_fixed() +
  scale_fill_viridis_c() +
  theme_bw()
# magma
g2 <- ggplot(data, aes(x, y)) + geom_hex() + coord_fixed() +
  scale_fill_viridis_c(option = "magma") +
  theme_bw()
# inferno
g3 <- ggplot(data, aes(x, y)) + geom_hex() + coord_fixed() +
  scale_fill_viridis_c(option = "inferno") +
  theme_bw()
# plasma
g4 <- ggplot(data, aes(x, y)) + geom_hex() + coord_fixed() +
  scale_fill_viridis_c(option = "plasma") +
  theme_bw()
# cividis
g5 <- ggplot(data, aes(x, y)) + geom_hex() + coord_fixed() +
  scale_fill_viridis_c(option = "cividis") +
  theme_bw()
# default colour of ggplot2
g6 <- ggplot(data, aes(x, y)) + geom_hex() + coord_fixed() + theme_bw()
gridExtra::grid.arrange(g1, g2, g3, g4, g5, g6)

gplotsパッケージのheatmap.2関数を使ってヒートマップ作成

heatmap_2 <-  read.table("heatmap_2.txt", header = T, sep = "\t", row.names = 1)
head(heatmap_2)
##         GSM995221_AD01M001.CEL.gz GSM995222_AD01M002.CEL.gz
## Il1r2                    9.709961                  9.499286
## Il1rl1                  10.869457                 10.595727
## Il18rap                  8.661484                  8.596499
## Icos                    11.556232                 11.625647
## Fam129a                 10.466855                 10.324924
## Rgs16                   10.649960                 10.630798
##         GSM995223_AD01M003.CEL.gz GSM995224_AD01M004.CEL.gz
## Il1r2                    8.653889                  8.677315
## Il1rl1                   9.412765                  9.279133
## Il18rap                  7.772801                  7.761333
## Icos                    11.010891                 10.862727
## Fam129a                  9.845083                  9.640511
## Rgs16                   10.320202                 10.154570
##         GSM995225_AD01M005.CEL.gz GSM995226_AD01M006.CEL.gz
## Il1r2                    6.766613                  7.282546
## Il1rl1                   6.696681                  6.555741
## Il18rap                  6.975710                  6.711794
## Icos                     8.990778                  9.695695
## Fam129a                  8.449838                  9.240955
## Rgs16                    9.282057                  9.599733
##         GSM995227_AD01M007.CEL.gz GSM995228_AD01M008.CEL.gz
## Il1r2                    7.030207                  7.114095
## Il1rl1                   6.561950                  6.557071
## Il18rap                  6.987141                  6.770077
## Icos                     9.515691                  9.177716
## Fam129a                  8.827236                  8.570377
## Rgs16                    9.579403                  9.278177
colnames(heatmap_2) <- c("1", "2", "3", "4", "5", "6", "7", "8")
head(heatmap_2)
##                 1         2         3         4        5        6        7
## Il1r2    9.709961  9.499286  8.653889  8.677315 6.766613 7.282546 7.030207
## Il1rl1  10.869457 10.595727  9.412765  9.279133 6.696681 6.555741 6.561950
## Il18rap  8.661484  8.596499  7.772801  7.761333 6.975710 6.711794 6.987141
## Icos    11.556232 11.625647 11.010891 10.862727 8.990778 9.695695 9.515691
## Fam129a 10.466855 10.324924  9.845083  9.640511 8.449838 9.240955 8.827236
## Rgs16   10.649960 10.630798 10.320202 10.154570 9.282057 9.599733 9.579403
##                8
## Il1r2   7.114095
## Il1rl1  6.557071
## Il18rap 6.770077
## Icos    9.177716
## Fam129a 8.570377
## Rgs16   9.278177
class(heatmap_2)
## [1] "data.frame"
library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(viridis)
## Loading required package: viridisLite
# png("heatmap.png", width = 400, height = 800)
heatmap.2(as.matrix(heatmap_2), col = bluered(256), trace = "none", cexCol = 0.9, cexRow = 0.5)

heatmap.2(as.matrix(heatmap_2), col = bluered(256), trace = "none", cexCol = 0.9, cexRow = 0.5, scale = "row") # scaling

# Seurat colour
heatmap.2(as.matrix(heatmap_2), col = colorRampPalette(c("#FF00FF", "#000000", "#FFFF00")), trace = "none", cexCol = 0.9, cexRow = 0.5, scale = "row")

heatmap.2(as.matrix(heatmap_2), col = viridis, trace = "none", cexCol = 0.9, cexRow = 0.5)

heatmap.2(as.matrix(heatmap_2), col = viridis, trace = "none", cexCol = 0.9, cexRow = 0.5, scale = "row")

heatmap.2(as.matrix(heatmap_2), col = viridis, trace = "none", cexCol = 0.9, cexRow = 0.5, scale = "col")

heatmap.2(as.matrix(heatmap_2), col = magma, trace = "none", cexCol = 0.9, cexRow = 0.5)

# scaling
heatmap.2(as.matrix(heatmap_2), col = magma, trace = "none", cexCol = 0.9, cexRow = 0.5, scale = "row")

heatmap.2(as.matrix(heatmap_2), col = plasma, trace = "none", cexCol = 0.9, cexRow = 0.5, scale = "row")

heatmap.2(as.matrix(heatmap_2), col = inferno, trace = "none", cexCol = 0.9, cexRow = 0.5, scale = "row")

heatmap.2(as.matrix(heatmap_2), col = cividis, trace = "none", cexCol = 0.9, cexRow = 0.5, scale = "row")

heatmap.2(as.matrix(heatmap_2), col = viridis, trace = "none", cexCol = 0.9, cexRow = 0.5, scale = "row")

同じような作業を繰り返すなら簡単なプログラミングで効率的に この程度ならfor構文で構わない map関数でもできると思う(勉強不足)

for (i in c(viridis, magma, plasma, inferno, cividis)) {
  heatmap.2(as.matrix(heatmap_2), col = i, trace = "none", cexCol = 0.9, cexRow = 0.5, scale = "row")
}

complexheatmapを用いると複雑なヒートマップ描画できるが今回は割愛

発展 論文でよく見る図を再現してみる MA plot Scatter plot Volcano plot

散布図に遺伝子名を追加ggrepel

seqlogoとか genome browserもいける

地図かける geom_sf()

動画も作れる gganimate